metadata = read.csv("https://github.com/borevitzlab/brachy-genotyping/raw/master/metadata/brachy-metadata.csv")
#str(metadata)
spp = read.csv("data/species_id.csv")
#str(spp)
#snpgdsVCF2GDS("data/2017-11-30_bhybridum_filtered_default.vcf.gz",
#              "data/2017-11-30_bhybridum_filtered_default.gds")
geno = snpgdsOpen("data/2017-11-30_bhybridum_filtered_default.gds",
                  allow.duplicate = T, readonly = T)
samp = snpgdsSummary(geno)$sample.id
## The file name: /home/kevin/ws/brachy-genotyping-notes/data/2017-11-30_bhybridum_filtered_default.gds 
## The total number of samples: 1968 
## The total number of SNPs: 1278299 
## SNP genotypes are stored in SNP-major mode (Sample X SNP).

Filter out very bad missing data

snp.missfilt = snpgdsSelectSNP(geno, missing.rate=0.999, autosome.only=F)
## Excluding 51,452 SNPs (monomorphic: TRUE, MAF: NaN, missing rate: 0.999)

The total number of SNPs remaining is 1226847.

Species subset

Directly subset the three species’ genotype matricies.

chrom = read.gdsn(index.gdsn(geno, "snp.chromosome"))[snp.missfilt]
snp.dis = snp.missfilt[grep('Bd21-3', chrom)]
snp.sta = snp.missfilt[grep('Bstacei', chrom)]
snp.all = snp.missfilt
samp.hyb = filter(spp, species.id == "hybridum")$sample
samp.dis = filter(spp, species.id == "distachyon")$sample
samp.sta = filter(spp, species.id == "stacei")$sample

Functions for further filtering

These things are in common for each species of brachy, so turn them into functions.

ssp.filt = function(geno, samps, snps,  max.snp.miss.rate=0.99,
                    max.samp.miss.rate=0.99, min.maf=0.001 ) {
    miss.samp = snpgdsSampMissRate(geno, snp.id=snps, sample.id=samps)
    
    hist(miss.samp, breaks=100, main="Sample Missing Data (pre-filt)")
    abline(v=max.samp.miss.rate, col="blue", lwd=2)
    
    samps = samps[miss.samp <= max.samp.miss.rate]
    
    srf = snpgdsSNPRateFreq(geno, sample.id=samps, snp.id = snps)
    miss.snp = srf$MissingRate
    hist(miss.snp, breaks=100, main="SNP missing data")
    abline(v=max.snp.miss.rate, col="blue", lwd=2)
    
    maf = srf$MinorFreq
    hist(maf, breaks=50, main="SNP MAF")
    abline(v=min.maf, lwd=2, col="blue")
    
    snps = snpgdsSelectSNP(geno, sample.id=samps, snp.id=snps, maf=min.maf,
                                     missing.rate=max.snp.miss.rate, autosome.only=F)
    
    miss.samp = snpgdsSampMissRate(geno, snp.id=snps, sample.id=samps)
    hist(miss.samp, breaks=100, main="Sample Missing Data (post-filt)")
    
    print(paste("Num SNPs:", length(snps)))
    print(paste("Num Samples:", length(samps)))
    return(list(snps=snps, samps=samps, miss.samp=miss.samp))
}

ssp.geno = function(geno, filt) {
    ibs = snpgdsIBS(geno, sample.id=filt$samps, snp.id=filt$snps,
                    autosome.only=F, num.thread=4)
    ibs.nacnt = rowSums(is.na(ibs$ibs))
    table(ibs.nacnt)
    return(ibs)
}

distimpute = function(ibs, thresh=5, maxdist = 1) {
    dist = 1 - ibs$ibs 
    nasum = colSums(is.na(dist))
    pass = nasum < thresh
    dist = dist[pass,pass]
    
    ibs$sample.id = ibs$sample.id[pass]
    
    dist.ut = dist
    dist.ut[upper.tri(dist.ut,diag=T)] = 0
    nasum = colSums(is.na(dist.ut))
    
    # impute
    for (j in which(nasum > 0)) {
        for (i in which(is.na(dist[,j]))) {
            k = which(dist.ut[,j] == max(dist.ut[,j], na.rm=T))
            if (length(k) > 1) {
                k = k[1]
            }
            if (dist[k, j] <= maxdist) {
                dist[i, j] = max(dist[k, j], # k, j is neighbour -> self
                                 dist[k, i]) # k, i is neighbour -> other
                dist[j, i] = max(dist[k, j], # k, j is neighbour -> self
                                 dist[k, i]) # k, i is neighbour -> other
            }
        }
    }
    
    ibs$ibs = 1 - dist
    return(ibs)
}


distimpute2 = function(ibs, max.NAs=0, max.dist = 0.2) {
    dist = 1 - ibs$ibs 
    
    dist.ut = dist
    dist.ut[upper.tri(dist.ut,diag=T)] = 0
    nasum = colSums(is.na(dist.ut))
    num.imputed = 0
    # impute
    for (j in which(nasum > 0)) {
        for (i in which(is.na(dist[,j]))) {
            k = which(dist.ut[,j] == max(dist.ut[,j], na.rm=T))
            if (length(k) > 1) {
                k = k[1]
            }
            if (dist[k, j] <= max.dist) {
                num.imputed = num.imputed + 1
                dist[i, j] = max(dist[k, j], # k, j is neighbour -> self
                                 dist[k, i]) # k, i is neighbour -> other
                dist[j, i] = max(dist[k, j], # k, j is neighbour -> self
                                 dist[k, i]) # k, i is neighbour -> other
            }
        }
    }

    num.removed = 0
    while (sum(is.na(dist)) > 0) {
        rm = which.max(colSums(is.na(dist)))
        dist = dist[-rm,]
        dist = dist[,-rm]
        ibs$sample.id = ibs$sample.id[-rm]
        num.removed = num.removed + 1
    }
    ibs$ibs = 1 - dist
    ibs$num.imputed = num.imputed
    ibs$num.removed = num.removed
    print(paste("Num imputed:", num.imputed))
    print(paste("Num removed:", num.removed))
    return(ibs)
}

Distachyon

samp.rils = metadata %>%
    filter(grepl("^RIL", accession)) %>% 
    select(anon.name) %>%
    unlist
samp.dis.sansrils = samp.dis[! samp.dis %in% samp.rils]

filt.dis = ssp.filt(geno, samp.dis.sansrils, snp.dis, min.maf=0.01,
                    max.samp.miss.rate = 0.995, max.snp.miss.rate=0.95)

## Excluding 751,055 SNPs (monomorphic: TRUE, MAF: 0.01, missing rate: 0.95)

## [1] "Num SNPs: 81400"
## [1] "Num Samples: 514"
ibs.dis = ssp.geno(geno, filt.dis)
## Identity-By-State (IBS) analysis on genotypes:
## Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
## Working space: 514 samples, 81,400 SNPs
##     using 4 (CPU) cores
## IBS:    the sum of all selected genotypes (0,1,2) = 6296601
## Thu Dec 14 16:03:36 2017    (internal increment: 30336)
## 
[..................................................]  0%, ETC: ---    
[==================================================] 100%, completed in 1s
## Thu Dec 14 16:03:37 2017    Done.
ibs.dis.imp = distimpute2(ibs.dis, max.dist = 0.2)
## [1] "Num imputed: 0"
## [1] "Num removed: 11"
dim(ibs.dis.imp$ibs)
## [1] 503 503

After the distance interpolation, we have interpolated 0 entries, and have removed 11 samples.

Find optimal z threshold

Run over a range of z thresholds.

grouping = metadata %>% 
    select(accession, anon.name)

for (zthresh in c(seq(1, 4, 0.5), 5:10)) {
    cut.dis = snpgdsHCluster(ibs.dis.imp) %>% 
        snpgdsCutTree(outlier.n=0, z.threshold = zthresh)
    s = cut.dis$sample.id
    grouping = grouping %>% 
        filter(anon.name %in% cut.dis$sample.id)
    d = cut.dis$samp.group[match(grouping$anon.name, s)]
    snpgdsDrawTree(cut.dis)
    
    grouping[[paste0("GroupAt", zthresh, "_Ngrps", length(table(d)))]] = d
}
## Determine groups by permutation (Z threshold: 1, outlier threshold: 0):
## Create 298 groups.

## Determine groups by permutation (Z threshold: 1.5, outlier threshold: 0):
## Create 209 groups.

## Determine groups by permutation (Z threshold: 2, outlier threshold: 0):
## Create 165 groups.

## Determine groups by permutation (Z threshold: 2.5, outlier threshold: 0):
## Create 132 groups.

## Determine groups by permutation (Z threshold: 3, outlier threshold: 0):
## Create 104 groups.

## Determine groups by permutation (Z threshold: 3.5, outlier threshold: 0):
## Create 74 groups.

## Determine groups by permutation (Z threshold: 4, outlier threshold: 0):
## Create 60 groups.

## Determine groups by permutation (Z threshold: 5, outlier threshold: 0):
## Create 49 groups.

## Determine groups by permutation (Z threshold: 6, outlier threshold: 0):
## Create 42 groups.

## Determine groups by permutation (Z threshold: 7, outlier threshold: 0):
## Create 37 groups.

## Determine groups by permutation (Z threshold: 8, outlier threshold: 0):
## Create 29 groups.

## Determine groups by permutation (Z threshold: 9, outlier threshold: 0):
## Create 16 groups.

## Determine groups by permutation (Z threshold: 10, outlier threshold: 0):
## Create 16 groups.

write.csv(grouping, "out/group_zscores.csv", row.names = F)

Plot draft dendrogram

best.z.thresh = 2.5
ibs.dis.plt = ibs.dis.imp
ibs.dis.plt$sample.id = paste(metadata$anon.name[match(ibs.dis.imp$sample.id, metadata$anon.name)],
                              metadata$accession[match(ibs.dis.imp$sample.id, metadata$anon.name)])
                              

hc.dis.plt = snpgdsHCluster(ibs.dis.plt) %>% 
    snpgdsCutTree(outlier.n=1, z.threshold = best.z.thresh, label.H = F, label.Z = F)
## Determine groups by permutation (Z threshold: 2.5, outlier threshold: 1):
## Create 131 groups.
snpgdsDrawTree(hc.dis.plt, leaflab="perpendicular")

Remove dodgy samples

dodgy = read.csv("data/dodgy-samples.csv")$anon.name
samp.dis.genocall = samp.dis.sansrils[! samp.dis.sansrils %in% dodgy]

filt.dis.geno = ssp.filt(geno, samp.dis.genocall, snp.dis, min.maf=0.01,
                         max.samp.miss.rate = 0.995, max.snp.miss.rate=0.95)

## Excluding 749,655 SNPs (monomorphic: TRUE, MAF: 0.01, missing rate: 0.95)

## [1] "Num SNPs: 82800"
## [1] "Num Samples: 495"
ibs.dis = ssp.geno(geno, filt.dis.geno)
## Identity-By-State (IBS) analysis on genotypes:
## Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
## Working space: 495 samples, 82,800 SNPs
##     using 4 (CPU) cores
## IBS:    the sum of all selected genotypes (0,1,2) = 6233397
## Thu Dec 14 16:04:13 2017    (internal increment: 31488)
## 
[..................................................]  0%, ETC: ---    
[==================================================] 100%, completed in 0s
## Thu Dec 14 16:04:13 2017    Done.
ibs.dis.imp = distimpute2(ibs.dis, max.dist = 0.2)
## [1] "Num imputed: 0"
## [1] "Num removed: 5"
dim(ibs.dis.imp$ibs)
## [1] 490 490

Call genotypes

best.z.thresh = 2.5
best.z.hc = snpgdsHCluster(ibs.dis.imp) %>% 
    snpgdsCutTree(outlier.n=1, z.threshold = best.z.thresh, label.Z = F)
## Determine groups by permutation (Z threshold: 2.5, outlier threshold: 1):
## Create 134 groups.
snpgdsDrawTree(best.z.hc)

sort(table(best.z.hc$samp.group), decreasing = T)
## 
##       G079       G018       G019       G012       G015       G052 
##         16         10         10          9          9          9 
##       G082       G025       G037       G077       G084       G013 
##          9          8          8          8          8          7 
##       G046       G102       G105       G107       G002       G004 
##          7          7          7          7          6          6 
##       G005       G024       G042       G059       G061       G074 
##          6          6          6          6          6          6 
##       G081       G083       G092       G009       G010       G014 
##          6          6          6          5          5          5 
##       G029       G030       G036       G041       G048       G063 
##          5          5          5          5          5          5 
##       G068       G090       G098       G003       G008       G011 
##          5          5          5          4          4          4 
##       G020       G021       G026       G027       G028       G038 
##          4          4          4          4          4          4 
##       G039       G043       G045       G047       G057       G058 
##          4          4          4          4          4          4 
##       G069       G071       G072       G085       G087       G093 
##          4          4          4          4          4          4 
##       G096       G099       G023       G033       G049       G050 
##          4          4          3          3          3          3 
##       G055       G056       G060       G070       G073       G076 
##          3          3          3          3          3          3 
##       G086       G088       G101       G103       G104       G106 
##          3          3          3          3          3          3 
##       G001       G006       G007       G016       G017       G022 
##          2          2          2          2          2          2 
##       G031       G032       G034       G035       G040       G044 
##          2          2          2          2          2          2 
##       G051       G053       G054       G062       G064       G065 
##          2          2          2          2          2          2 
##       G066       G067       G075       G078       G080       G089 
##          2          2          2          2          2          2 
##       G091       G094       G095       G097       G100 Outlier001 
##          2          2          2          2          2          1 
## Outlier002 Outlier003 Outlier004 Outlier005 Outlier006 Outlier007 
##          1          1          1          1          1          1 
## Outlier008 Outlier009 Outlier010 Outlier011 Outlier012 Outlier013 
##          1          1          1          1          1          1 
## Outlier014 Outlier015 Outlier016 Outlier017 Outlier018 Outlier019 
##          1          1          1          1          1          1 
## Outlier020 Outlier021 Outlier022 Outlier023 Outlier024 Outlier025 
##          1          1          1          1          1          1 
## Outlier026 Outlier027 
##          1          1
goodgroups = data.frame(best.z.hc[c("sample.id", "samp.group")]) %>% 
    filter(grepl("^G", samp.group))
genocall = metadata %>% 
    select(accession, anon.name) %>% 
    right_join(goodgroups, by=c("anon.name"="sample.id"))
## Warning: Column `anon.name`/`sample.id` joining factors with different
## levels, coercing to character vector
genocall$missing.rate = snpgdsSampMissRate(geno, sample.id=as.character(genocall$anon.name),
                                           snp.id = filt.dis$snps)
write.csv(genocall, "out/genotypes.csv", row.names = F)

Now, we take the indiviudal run with the least missing data from each clonal lineage

genocall.best = genocall %>% 
    group_by(samp.group) %>% 
    summarise(anon.name = anon.name[which.min(missing.rate)],
              accession = accession[which.min(missing.rate)],
              missing.rate = min(missing.rate))

write.csv(genocall.best, "out/genotypes_best.csv", row.names = F)
samps = genocall.best$anon.name
snps = filt.dis$snps
max.snp.miss.rate = 0.9
min.maf = 0.05

srf = snpgdsSNPRateFreq(geno, sample.id=samps, snp.id=snps)
miss.snp = srf$MissingRate
hist(miss.snp, breaks=100, main="SNP missing data")
abline(v=max.snp.miss.rate, lwd=2, col="blue")

maf = srf$MinorFreq
hist(maf, breaks=50, main="SNP MAF")
abline(v=min.maf, lwd=2, col="blue")

snps = snpgdsSelectSNP(geno, sample.id=samps, snp.id=snps, maf=min.maf,
                       missing.rate=max.snp.miss.rate, autosome.only=F)
## Excluding 56,652 SNPs (monomorphic: TRUE, MAF: 0.05, missing rate: 0.9)
srf.post = snpgdsSNPRateFreq(geno, sample.id=samps, snp.id=snps)
hist(srf.post$MissingRate, breaks=100, main="SNP missing data (post)")

print(paste("Num SNPs:", length(snps)))
## [1] "Num SNPs: 24748"
print(paste("Num Samples:", length(samps)))
## [1] "Num Samples: 107"
struct.genos = snpgdsGetGeno(geno, sample.id = samps, snp.id=snps)
## Genotype matrix: 107 samples X 24748 SNPs
hist(colSums(struct.genos, na.rm = T))

table(colSums(struct.genos, na.rm = T)==2)
## 
## FALSE  TRUE 
## 24420   328
hist(colSums(is.na(struct.genos))/nrow(struct.genos))

rownames(struct.genos) = samps
struct.genos[struct.genos==1] = NA
write.table(struct.genos, "out/structure_genos_bdis.txt", na="-9",
            sep="\t", quote = F, col.names = F)
ibs.dis.plt = ibs.dis.imp
ibs.dis.plt$sample.id = paste(metadata$anon.name[match(ibs.dis.imp$sample.id, metadata$anon.name)],
                              metadata$accession[match(ibs.dis.imp$sample.id, metadata$anon.name)])
                              

hc.dis.plt = snpgdsHCluster(ibs.dis.plt) %>% 
    snpgdsCutTree(outlier.n=1, z.threshold = best.z.thresh, label.H = F, label.Z = F)
## Determine groups by permutation (Z threshold: 2.5, outlier threshold: 1):
## Create 133 groups.
snpgdsDrawTree(hc.dis.plt, leaflab="perpendicular")

Hybridum and stacei

filt.hyb = ssp.filt(geno, samp.hyb, snp.all, min.maf=0.01,
                    max.samp.miss.rate = 0.99, max.snp.miss.rate=0.97)

## Excluding 1,122,425 SNPs (monomorphic: TRUE, MAF: 0.01, missing rate: 0.97)

## [1] "Num SNPs: 104422"
## [1] "Num Samples: 692"
ibs.hyb = ssp.geno(geno, filt.hyb)
## Identity-By-State (IBS) analysis on genotypes:
## Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
## Working space: 692 samples, 104,422 SNPs
##     using 4 (CPU) cores
## IBS:    the sum of all selected genotypes (0,1,2) = 8083459
## Thu Dec 14 16:04:26 2017    (internal increment: 22528)
## 
[..................................................]  0%, ETC: ---    
[==================================================] 100%, completed in 2s
## Thu Dec 14 16:04:28 2017    Done.
ibs.hyb.imp = distimpute2(ibs.hyb, max.dist = 0.2)
## [1] "Num imputed: 0"
## [1] "Num removed: 13"
table(colSums(is.na(ibs.hyb.imp$ibs)))
## 
##   0 
## 679
snpgdsHCluster(ibs.hyb.imp) %>% 
    snpgdsCutTree() %>% 
    snpgdsDrawTree(outlier.n = 0)
## Determine groups by permutation (Z threshold: 15, outlier threshold: 5):
## Create 23 groups.

filt.sta = ssp.filt(geno, samp.sta, snp.sta, min.maf=0.01,
                    max.samp.miss.rate = 0.99, max.snp.miss.rate=0.97)

## Excluding 334,637 SNPs (monomorphic: TRUE, MAF: 0.01, missing rate: 0.97)

## [1] "Num SNPs: 59755"
## [1] "Num Samples: 56"
ibs.sta = ssp.geno(geno, filt.sta)
## Identity-By-State (IBS) analysis on genotypes:
## Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
## Working space: 56 samples, 59,755 SNPs
##     using 4 (CPU) cores
## IBS:    the sum of all selected genotypes (0,1,2) = 328781
## Thu Dec 14 16:04:32 2017    (internal increment: 65536)
## 
[..................................................]  0%, ETC: ---    
[==================================================] 100%, completed in 0s
## Thu Dec 14 16:04:32 2017    Done.
ibs.sta.imp = distimpute2(ibs.hyb, max.dist = 0.2)
## [1] "Num imputed: 0"
## [1] "Num removed: 13"
table(colSums(is.na(ibs.sta.imp$ibs)))
## 
##   0 
## 679
snpgdsHCluster(ibs.sta.imp) %>% 
    snpgdsCutTree(outlier.n=0) %>% 
    snpgdsDrawTree()
## Determine groups by permutation (Z threshold: 15, outlier threshold: 0):
## Create 23 groups.